'''
File name: project.ipynb
Date created: 03/11/2019
Date last modified: 20/12/2019
Python Version: 3.7.4
''';
import pandas as pd
import numpy as np
import geopandas as gpd
import plotly
import plotly.express as px
import plotly.graph_objs as go
from utils import constants as cst
from utils import clean_database
from utils import areas_handler
from utils import chicago_map_handler as maps
# Set auto-reload
%load_ext autoreload
%autoreload 2
In this section we load and clean the databases. All methods used are described in utils/clean_database.
# Load the community areas dataframe (GeoJSON/shapefile via geopandas)
areas_DF = gpd.read_file(cst.AREAS_PATH)
# Clean the dataframe (see utils/clean_database.clean_areas_df)
areas_DF = clean_database.clean_areas_df(areas_DF)
areas_DF.head()
# Load the food inspections dataframe.
# NOTE(review): error_bad_lines was deprecated in pandas 1.3 and removed in 2.0;
# switch to on_bad_lines='skip' when upgrading pandas.
food_inspections_DF = pd.read_csv(cst.FOOD_INSPECTIONS_PATH, sep = ',', header = 0,
names = cst.FOOD_INSPECTIONS_COL_NAMES, index_col = None, error_bad_lines=False
)
# Clean the dataframe (needs areas_DF to resolve community areas)
food_inspections_DF = clean_database.clean_food_inspections_df(food_inspections_DF, areas_DF)
food_inspections_DF.head()
# Load the socio-economic indicators dataframe.
# NOTE(review): error_bad_lines was deprecated in pandas 1.3 and removed in 2.0.
socio_economic_DF = pd.read_csv(cst.SOCIO_ECONOMIC_INDICATORS_PATH, sep = ',', header = 0,
names = cst.SOCIO_ECONOMIC_COL_NAMES, index_col = None, error_bad_lines=False
)
# Clean the dataframe
socio_economic_DF = clean_database.clean_socio_economic_df(socio_economic_DF)
socio_economic_DF.head()
# Load the life expectancy dataframe.
life_expectancy_DF = pd.read_csv(cst.LIFE_EXPECTANCY_PATH, sep = ',', header = 0,
names = cst.LIFE_EXPECTANCY_COL_NAMES, index_col = None, error_bad_lines=False
)
# Clean the dataframe.
# NOTE(review): this reuses clean_socio_economic_df on the life expectancy table —
# possibly a copy-paste from the cell above; confirm the same cleaning applies.
life_expectancy_DF = clean_database.clean_socio_economic_df(life_expectancy_DF)
life_expectancy_DF.head()
The solutions described are implemented in utils/areas_handler.
# Filter inspections whose lng/lat are unknown
food_unknown_loc = food_inspections_DF[food_inspections_DF['lat'].isna()]
# Geocode the missing addresses (via OpenStreetMap, see utils/areas_handler)
unknown_locations = areas_handler.get_unknown_locations(food_unknown_loc)
# Check the locations that OpenStreetMap could not resolve either
unknown_locations[pd.isnull(unknown_locations['lat'])].head()
# Display the Chicago areas.
# This map contains additional layers to visually check if the found locations
# are actually within the borders of the city.
map_chicago = maps.create_chicago_map(with_community_areas=True)
map_chicago
# Save map as standalone HTML
map_chicago.save(outfile=cst.MAPS_PATH+"map_intro.html")
Description of the map
On the map above we can see the city of Chicago and the region where the facilities of the inspections presented in the food_inspections dataset are located. In the following, we will focus and get some insights on that area.
# Use unknown_locations to fill lat and lng in the original dataframe food_inspections_DF.
# The merge produces duplicated lat/lng columns (_x from the original, _y freshly
# geocoded); keep the geocoded ones and restore the original index.
food_unknown_loc = food_unknown_loc.reset_index().merge(unknown_locations, on="address", how='left') \
.set_index('index').drop(['lat_x', 'lng_x'], axis = 1) \
.rename(columns={'lat_y':'lat','lng_y':'lng'})
# update() aligns on the index and overwrites the NaN coordinates in place
food_inspections_DF.update(food_unknown_loc)
# Check which latitudes are still unknown
print('%d food inspections still missing lat long info, out of %d. '%(food_inspections_DF.lat.isna().sum(), len(food_inspections_DF)))
So we have no more unknown locations in our food inspections dataframe.
# Resolve area numbers and delete unknown areas.
# Takes a while if the result is not already cached/saved.
food_inspections_DF = areas_handler.get_locations_with_area(food_inspections_DF, areas_DF)
print("Number of locations: " + str(food_inspections_DF.shape[0]))
# Drop locations not in the city of chicago (no community area resolved)
food_inspections_DF = food_inspections_DF.dropna(subset=[cst.AREA_NUM])
print("Number of locations in the city of chicago: " + str(food_inspections_DF.shape[0]))
food_inspections_DF[cst.AREA_NUM] = food_inspections_DF[cst.AREA_NUM].astype(int)
# Create new dataframe with number of inspections per area
inspection_counts = food_inspections_DF[cst.AREA_NUM].value_counts().to_frame()
inspection_counts.reset_index(level=0, inplace=True)
inspection_counts.columns=[cst.AREA_NUM,'num_inspections']
# Area numbers as strings — presumably to match the map layer's keys; confirm in maps.heat_map
inspection_counts[cst.AREA_NUM] = inspection_counts[cst.AREA_NUM].astype(str)
# NOTE(review): result of sort_values is discarded (no assignment/inplace) — effectively a no-op
inspection_counts.sort_values('num_inspections');
rename_mapping = {"num_inspections": "number of inspections"}
heatmap_chicago = maps.heat_map(inspection_counts, "Number of Inspections", cst.AREA_NUM, "num_inspections", \
rename_mapping=rename_mapping)
heatmap_chicago
# Save map
heatmap_chicago.save(outfile=cst.MAPS_PATH+"num_inspections.html")
Description of the map
The heatmap above shows the number of inspections on the area we are focusing on and determined before. We can see that the more we approach the city center of Chicago, the higher the number of inspections.
However, it is also curious to see that in the north of Chicago, the number of inspections is higher than in the south. This could be due to a higher number of establishments in this area; we check this below.
Here we determine and analyse the number of establishments per region area.
# Count distinct establishments per area: one row per licence number
count_DF = food_inspections_DF.copy()
count_DF = count_DF.drop_duplicates(subset=['license_num'])
count_ser = count_DF[cst.AREA_NUM].value_counts().to_frame()
count_ser.reset_index(level=0, inplace=True)
count_ser.columns=[cst.AREA_NUM,'num_establishments']
count_ser[cst.AREA_NUM] = count_ser[cst.AREA_NUM].astype(str)
# NOTE(review): result of sort_values is discarded (no assignment/inplace) — effectively a no-op
count_ser.sort_values('num_establishments');
rename_mapping = {"num_establishments": "number of establishments"}
heatmap_chicago = maps.heat_map(count_ser, "Number of Establishments", cst.AREA_NUM, 'num_establishments', \
rename_mapping=rename_mapping)
heatmap_chicago
# Save map
heatmap_chicago.save(outfile=cst.MAPS_PATH+"num_establishments.html")
Description of the map
The heatmap above shows the number of establishments in the area we want to analyze. Same as before, we observe that the closer we get to the city center or the north of the city, the higher the number of establishments. Indeed, the number of inspections is correlated with the number of establishments; the more establishments there are in a given area, the more inspections take place in that area.
We compute a "risk_ser" dataframe, which has two columns containing the area number and the risk level. To simplify the risk values, we convert the risk in the food inspections dataframe to an integer.
# Mapping from the textual risk levels of the food inspections dataset to an
# ordinal scale (higher = riskier). 'All' is mapped to the middle value.
_RISK_LEVELS = {
    'Risk 3 (Low)': 1,
    'Risk 2 (Medium)': 2,
    'Risk 1 (High)': 3,
    'All': 2,
}

def risk_to_num(x):
    """Convert a textual risk level to an int in [1, 3].

    Any unrecognised value is returned unchanged — exactly the fall-through
    behaviour of the original if-chain (such rows are filtered out later).
    The dict dispatch replaces a confusing dangling-else chain of ifs.
    """
    return _RISK_LEVELS.get(x, x)
risk_DF = food_inspections_DF.copy()
# Drop inspections with no recorded risk before converting to ints
risk_DF = risk_DF.dropna(subset=['risk'])
risk_DF['risk'] = risk_DF['risk'].apply(lambda x : risk_to_num(x)).astype(int)
# Average converted risk per community area
risk_ser = risk_DF.groupby(cst.AREA_NUM)['risk'].mean().to_frame()
risk_ser.reset_index(level=0, inplace=True)
risk_ser[cst.AREA_NUM] = risk_ser[cst.AREA_NUM].astype(str)
rounding = 3
risk_ser['risk'] = risk_ser['risk'].apply(lambda x: round(x, rounding))
heatmap_chicago = maps.heat_map(risk_ser, "Average risk", cst.AREA_NUM, 'risk')
heatmap_chicago
# Save map
heatmap_chicago.save(outfile=cst.MAPS_PATH+"average_risk.html")
Description of the map
The heatmap above shows the average risk of the establishments for each community area. The risk of an establishment shows how it could potentially affect the public's health, with 1 being the highest and 3 the lowest. Furthermore, high risk establishments are inspected more frequently than low risk establishments.
We compute a "result_ser" dataframe, which has two columns containing the area number and the average inspection result in that area. To simplify the result values, we convert the result in the food inspections dataframe to an integer between 0 (Inspection failed) and 2 (inspection passed).
# Peek at the raw inspection result values before converting them to numbers
food_inspections_DF.result
def result_to_num(e):
    """Map an inspection result string to an ordinal score.

    'Pass' -> 2, 'Pass w/ Conditions' -> 1, 'Fail' -> 0. Any other value
    yields None, matching the original if-chain's implicit fall-through
    (those rows are dropped right after).
    """
    scores = {'Pass': 2, 'Pass w/ Conditions': 1, 'Fail': 0}
    return scores.get(e)
result_DF = food_inspections_DF.copy()
# Convert results to 0/1/2; unmatched results become None and are dropped next
result_DF['result'] = result_DF['result'].apply(lambda x : result_to_num(x))
result_DF = result_DF.dropna(subset=['result'])
result_DF['result'] = result_DF['result'].astype(int)
# Average converted result per community area
result_ser = result_DF.groupby(cst.AREA_NUM)['result'].mean().to_frame()
result_ser.reset_index(level=0, inplace=True)
result_ser[cst.AREA_NUM] = result_ser[cst.AREA_NUM].astype(str)
rounding = 3
result_ser['result'] = result_ser['result'].apply(lambda x: round(x, rounding))
heatmap_chicago = maps.heat_map(result_ser, "Average result", cst.AREA_NUM, 'result', good_indicator=True)
heatmap_chicago
# Save map
heatmap_chicago.save(outfile=cst.MAPS_PATH+"average_result.html")
Description of the map
The heatmap above shows the average inspection result of the establishments for each community area, where 0 corresponds to a failed inspection, 1 to a pass with conditions and 2 to a pass. Higher values therefore indicate areas whose establishments tend to pass their inspections.
#inspections_per_est = inspection_counts.merge(count_ser, left_on=cst.AREA_NUM, right_on=cst.AREA_NUM).drop(['index'], axis = 1)
# Combine the two per-area counts and derive inspections per establishment
inspections_per_est = inspection_counts.merge(count_ser, left_on=cst.AREA_NUM, right_on=cst.AREA_NUM)
rounding = 3
inspections_per_est['inspections_per_establishment'] = inspections_per_est.apply(lambda x : round(x.num_inspections/x.num_establishments,rounding), axis=1)
rename_mapping = {'inspections_per_establishment':'inspections per establishment'}
heatmap_chicago = maps.heat_map(inspections_per_est, "Number of Inspections per Establishment", cst.AREA_NUM, \
'inspections_per_establishment', rename_mapping=rename_mapping)
heatmap_chicago
# Save map
heatmap_chicago.save(outfile=cst.MAPS_PATH+"inspections_per_establishment.html")
Description of the map
The heatmap above shows the number of inspections per establishment and as highlighted before, we can see that the establishments with a high number of inspections present high average risks.
Each establishment can receive a violation number between 1-44 or 70, and the number is followed by a specific description of the findings that caused the violation to be issued. The higher the number, the less severe the violation. Establishments collecting only high numbers will probably pass, whereas the others will probably fail.
An inspection of an establishment can pass, pass with conditions or fail depending on these numbers. The 'pass' condition is given when the establishment were found to have no critical or serious violations. Establishments that received a 'pass with conditions' were found to have critical or serious violations but these violations were corrected during the inspection. Finally, the 'fail' condition is issued when the establishment were found to have critical or serious violations that were not correctable during the inspection.
We will analyse more deeply the violations for milestone 3.
# Merge socio-economic and life expectancy df's on the area number and names
socio_life_merged_DF = socio_economic_DF.merge(life_expectancy_DF, how="left", on=["community_area_num", "community_area_name"])
# Area number as string for the map handler
socio_life_merged_DF[cst.AREA_NUM] = socio_life_merged_DF[cst.AREA_NUM].astype(str)
# List the available columns to pick heatmap metrics from
for c in socio_life_merged_DF.columns:
    print(c)
rename_mapping = {'per_capita_income':'per capita income ($)'}
# good_indicator=True: higher values are better (presumably flips the colour
# scale — confirm in maps.heat_map)
heatmap_chicago = maps.heat_map(socio_life_merged_DF, "Per capita income 2010",cst.AREA_NUM ,'per_capita_income',\
good_indicator = True, rename_mapping=rename_mapping)
heatmap_chicago
# Save map
heatmap_chicago.save(outfile=cst.MAPS_PATH+"per_capita_income.html")
rename_mapping = {'life_exp_2010':'life expectancy 2010 (years)'}
heatmap_chicago = maps.heat_map(socio_life_merged_DF, "Life expectancy 2010",cst.AREA_NUM ,'life_exp_2010', \
good_indicator = True, rename_mapping=rename_mapping)
heatmap_chicago
# Save map
heatmap_chicago.save(outfile=cst.MAPS_PATH+"life_exp_2010.html")
We could compare the map above, displaying the life expectancies, with the percentage of households below the poverty line.
# Heatmap of the share of households below the poverty line per area
# (the misspelled 'housholds' is the actual column name in the dataset)
rename_mapping = {'housholds_below_poverty_perc':'housholds below poverty line (perc)'}
heatmap_chicago = maps.heat_map(socio_life_merged_DF, "housholds below poverty line as percentage", cst.AREA_NUM, \
'housholds_below_poverty_perc', rename_mapping=rename_mapping)
heatmap_chicago
# Save map
heatmap_chicago.save(outfile=cst.MAPS_PATH+"housholds_below_poverty_perc.html")
In the first map, some community areas have no entry for their life expectancies. Clearly the life expectancies are higher near the center of Chicago, and on its north side. These are also the regions with the lowest share of households below the poverty line. Around West Garfield Park, the life expectancy drops drastically, as does the share of households above the poverty line.
# Display Top20 inspected facility types
food_inspections_DF["facility_type"].value_counts()[:20]
# Create a filtered dataframe that only contains restaurants and stores.
# NOTE(review): dropna() with no subset discards rows with a NaN in ANY column —
# confirm this is intended rather than e.g. dropna(subset=['facility_type']).
food_inspections_DF = food_inspections_DF.dropna()
filtered_inspections_DF = food_inspections_DF[(food_inspections_DF["facility_type"].str.lower().str.contains("store")) | (food_inspections_DF["facility_type"].str.lower().str.contains("restaurant"))]
# Distinct restaurant/store establishments per area (one row per licence)
count_DF = filtered_inspections_DF.copy()
count_DF = count_DF.drop_duplicates(subset=['license_num'])
count_ser = count_DF[cst.AREA_NUM].value_counts().to_frame()
count_ser.reset_index(level=0, inplace=True)
count_ser.columns=[cst.AREA_NUM,'num_establishments']
count_ser[cst.AREA_NUM] = count_ser[cst.AREA_NUM].astype(str)
# NOTE(review): result of sort_values is discarded — effectively a no-op
count_ser.sort_values('num_establishments');
# Inspections per area for the filtered facilities
inspection_counts = filtered_inspections_DF[cst.AREA_NUM].value_counts().to_frame()
inspection_counts.reset_index(level=0, inplace=True)
inspection_counts.columns=[cst.AREA_NUM,'num_inspections']
inspection_counts[cst.AREA_NUM] = inspection_counts[cst.AREA_NUM].astype(str)
inspection_counts.sort_values('num_inspections');
def extract_violation_num(x):
    """Return the numeric violation code at the start of a violation string.

    Violation strings look like '12. <description>'; everything before the
    first dot is the code. A string with no dot is parsed whole.
    """
    code, _, _ = x.partition('.')
    return int(code)
def result_to_pass(x):
    """Encode an inspection result as 0 (any kind of pass) or 100 (otherwise).

    The 0/100 encoding makes a per-area mean directly readable as a
    failure percentage. The match is case-insensitive.
    """
    return 0 if "pass" in x.lower() else 100
failure_DF = filtered_inspections_DF.copy()
failure_DF = failure_DF.dropna(subset=['violations'])
# Keep only inspections with a definite pass/fail outcome
failure_DF = failure_DF[(failure_DF["result"].str.lower().str.contains("pass")) | (failure_DF["result"].str.lower().str.contains("fail"))]
# First violation code listed per inspection (appears unused later in this notebook)
failure_DF['pass_num'] = failure_DF['violations'].apply(lambda x : extract_violation_num(x)).astype(int)
# 0/100 encoding so the per-area mean reads directly as a failure percentage
failure_DF['failed inspections (%)'] = failure_DF['result'].apply(lambda x : result_to_pass(x)).astype(int)
failure_ser = failure_DF.groupby(cst.AREA_NUM)['failed inspections (%)'].mean().to_frame()
failure_ser.reset_index(level=0, inplace=True)
failure_ser[cst.AREA_NUM] = failure_ser[cst.AREA_NUM].astype(str)
rounding = 3
failure_ser['failed inspections (%)'] = failure_ser['failed inspections (%)'].apply(lambda x: round(x, rounding))
heatmap_chicago = maps.heat_map(failure_ser, "Percentage of failed inspections for restaurants and stores", cst.AREA_NUM, 'failed inspections (%)')
heatmap_chicago
# Save map
heatmap_chicago.save(outfile=cst.MAPS_PATH+"map_restaurant_pass.html")
The center and north of Chicago mostly have higher life expectancies and lower percentages of households below the poverty line.
With respect to food inspections, these areas also have more restaurants, and more food inspections per restaurant than the south.
# Check: range, mean with confidence interval for the main socio-economic columns
important_columns = ['community_area_num', 'community_area_name', 'housing_crowded_perc',
'housholds_below_poverty_perc', 'aged_16_or_more_unemployed_perc',
'aged_25_or_more_without_high_school_diploma_perc',
'aged_under_18_or_over_64_perc', 'per_capita_income', 'hardship_idx',
'life_exp_2010']
socio_life_merged_DF[important_columns].describe()
We can see above the mean, standard deviations and confidence intervals for some columns that highlight the socio-economic factors of all areas. Of course in the database all community areas are considered equally, and so this does not take into account the sizes of the areas, or the number of people in them. However it is still interesting to analyse this result, to get some insight on what we are dealing with. For example it is surprising that the standard deviation of the income per capita is more than half of its mean! Or that the life expectancies in all areas have an std of more than 4 years.
In what follows we analyse more closely the correlations between the different features in this dataframe.
# Pairwise correlations between the socio-economic metrics
corr = socio_life_merged_DF[cst.SOCIOECONOMIC_METRICS].corr()
corr
We now want to make sure that the correlation between any two metrics that are both good or both bad is always positive, and that the correlation between a good and a bad metric is always negative. For this, we create the boolean variable sign_kept: only if the above condition is always met will sign_kept be true.
# Indicators where higher values are worse / better, respectively.
bad_metrics = {'housing_crowded_perc', 'housholds_below_poverty_perc', 'aged_16_or_more_unemployed_perc',
               'aged_25_or_more_without_high_school_diploma_perc', 'hardship_idx', 'aged_under_18_or_over_64_perc'}
good_metrics = {'per_capita_income', 'life_exp_2010'}
# Verify that every same-type pair (both good or both bad) correlates
# positively and every mixed pair correlates negatively.
sign_kept = True
for c1 in cst.SOCIOECONOMIC_METRICS:
    for c2 in cst.SOCIOECONOMIC_METRICS:
        both_bad = c1 in bad_metrics and c2 in bad_metrics
        both_good = c1 in good_metrics and c2 in good_metrics
        mixed = (c1 in bad_metrics and c2 in good_metrics) or (c1 in good_metrics and c2 in bad_metrics)
        if (both_bad or both_good) and corr[c1][c2] < 0:
            sign_kept = False
        elif mixed and corr[c1][c2] > 0:
            sign_kept = False
if sign_kept:
    print('The correlation between indicators both good or both bad is always positive,\n and the correlation between a good and a bad indicator is always negative')
# Mask the trivial self-correlations (always 1.0) so idxmax/idxmin below pick a
# *different* metric as the strongest correlate.
# Fix: use .loc for the assignment — the original chained indexing
# (corr[c][c] = None) writes through an intermediate object, which triggers
# SettingWithCopyWarning and silently stops working under pandas copy-on-write.
for c in corr.columns:
    corr.loc[c, c] = None
# Strongest positive correlate of each metric
corrmax = pd.DataFrame(corr.idxmax()).rename({0: 'Strongest positive correlation'}, axis = 1)
corrmax['Correlation value'] = corr.max()
corrmax
# Strongest negative correlate of each metric
corrmin = pd.DataFrame(corr.idxmin()).rename({0: 'Strongest negative correlation'}, axis = 1)
corrmin['Correlation value'] = corr.min()
corrmin
Of the above correlations, we notice certain things: Firstly, we can classify the indicators between good (life expectancy and per capita income) and bad (percentage of crowded houses, percentage of below-poverty households, percentage of over-16 unemployed people, percentage of over-25 people without a high school diploma, the hardship index, and the percentage of people under 18 and over 64), and the correlations between indicators either both good or both bad will always be positive, whereas the correlation between a good and a bad indicator will always be negative (sign_kept).
We also notice that the percentage of people under 18 or over 64 is a strong negative indicator: it is more negatively correlated to per capita income than, for example, the percentage of houses living below the poverty line.
It is indeed quite surprising that per capita average income is not more correlated to the percentage of houses living below the poverty line (correlation is -0.56). We plot the 2 metrics in order to see this:
One reason the linear correlation is so low is that the relationship is exponential. However, we also notice that the top 5 highest per capita neighbourhoods are not in the top 15 lowest poor-household percentages.
We now plot a graph showing the per capita income on the y-axis and the housholds below poverty on the x-axis, of every community area. The plot is interactive, you can hover over a point to see its community area name and the precise values reported and click on it to see how other areas of the same region perform.
# Enable offline plotly rendering inside the notebook
plotly.offline.init_notebook_mode(connected=True)
from utils import interactive_plot_handler
from utils.interactive_plot_handler import region_to_color, name_to_idx, name_to_region
# Add a 'region' column based on # https://en.wikipedia.org/wiki/Chicago#/media/File:Chicago_community_areas_map.svg
areas_DF['region'] = areas_DF['community_area_name'].apply(lambda name: name_to_region[name])
# Display the Chicago regions.
# This map contains additional layers to visually check if the found locations
# are actually within the borders of the city.
map_chicago = maps.create_chicago_map(with_community_areas=True, with_regions=True, areas_DF=areas_DF)
map_chicago
# Save map
map_chicago.save(outfile=cst.MAPS_PATH+"map_chicago_regions_colored.html")
# Add the same 'region' column to the socio-economic dataframe
socio_life_merged_DF['region'] = socio_life_merged_DF['community_area_name'].apply(lambda name: name_to_region[name])
# Interactive scatter: per capita income vs households below poverty, per area
fig = interactive_plot_handler.interactive_plot_capita_income_wrt_households(socio_life_merged_DF)
fig
# Saves the figure depending on which point you click -> replace interactivity by showing right figure on page.
#region = 'all'
#fig_path = cst.FIG_PATH + 'comm_area_per_income_below_poverty_'+ region + '.html'
#fig.write_html(fig_path)
In this section we attempt to predict if an establishment will be inspected in 2019. We therefore define the problem as a binary classification, and, in order to understand the accuracy, we define the period as starting from April 2018.
What we want to see is not so much whether or not we can predict this, but the effect that each factor has. We use Logistic Regression. For the categorical variables, we use one hot encoding. In order to compare the weights and see the importance each feature has in the decision, we use the MinMax scaler.
from datetime import datetime
from tqdm import tqdm_notebook as tqdm
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.metrics import accuracy_score
from utils import predictions_helpers
from utils.predictions_helpers import risk_to_num, inspected_after_2018, n_inspections_before_2018, \
results_to_number, int_of_x
# Convert textual risk to ints using the helper imported from predictions_helpers
food_inspections_DF['risk'] = food_inspections_DF['risk'].apply(risk_to_num)
# Aggregation spec: keep one facility type / area per licence,
# collect the per-inspection fields into lists
d = {
'facility_type': lambda l: list(l)[0],
'risk': list,
'inspection_date': list,
'inspection_type': list,
'result': list,
'community_area_num': lambda l: list(l)[0]
}
# Build a dataframe in which each row is a facility (one per licence number).
# .loc[1:] skips index labels below 1 — presumably the licence-0 placeholder
# bucket; TODO confirm against the raw data.
establishment_centric = food_inspections_DF.groupby('license_num').agg(d).loc[1:]
# Keep establishments with more than two recorded inspections
# (the original comment said "inspected once", but the filter also drops
# those inspected exactly twice)
establishment_centric_many = establishment_centric[establishment_centric.risk.apply(len) > 2].reset_index()
# See all possible result values of inspections
all_options = set()
for e in establishment_centric_many.result:
    for i in e:
        all_options.add(i)
print(all_options)
# Based on the performance between 2010 - 2017, predict if there is an inspection in 2019.
# Target label: was the facility inspected after 2018?
establishment_centric_many['inspected_after_2018'] = establishment_centric_many.inspection_date.apply(inspected_after_2018)
establishment_centric_many['average_risk'] = establishment_centric_many.risk.apply(np.mean)
# Feature: number of inspections inside the feature window (before 2018)
establishment_centric_many['n_inspections'] = establishment_centric_many.inspection_date.apply(n_inspections_before_2018)
establishment_centric_many['average_result'] = establishment_centric_many.result.apply(results_to_number)
# Drop facilities whose risk list produced no usable average
establishment_centric_many = establishment_centric_many[~ establishment_centric_many.average_risk.isna()]
# Normalise the join key type before merging with the socio-economic data
socio_life_merged_DF['community_area_num'] = socio_life_merged_DF.community_area_num.apply(int_of_x)
establishment_centric_many_social = establishment_centric_many.merge(socio_life_merged_DF.dropna(), on = 'community_area_num').reset_index()
# One hot encode facility type.
# NOTE(review): the dummies are built from establishment_centric_many but merged
# into establishment_centric_many_social on 'index' — confirm the two indices
# still line up after the socio-economic merge above dropped/reordered rows.
establishment_centric_many_merged = establishment_centric_many_social.reset_index().merge(pd.get_dummies(establishment_centric_many.facility_type).reset_index()\
, on = 'index')
# Income is heavy-tailed; also keep its log as a feature
establishment_centric_many_merged['per_capita_income_log'] = establishment_centric_many_merged['per_capita_income'].apply(np.log)
# Define the socio economic metrics (lists so they can be concatenated for column selection)
bad_metrics = list(['housing_crowded_perc', 'housholds_below_poverty_perc', 'aged_16_or_more_unemployed_perc',
'aged_25_or_more_without_high_school_diploma_perc', 'hardship_idx', 'aged_under_18_or_over_64_perc'])
good_metrics = list(['per_capita_income', 'life_exp_2010' ])
# Let's look at the risk for some facility types:
# mean of the first listed risk per facility type, ascending (lowest risk first)
establishment_centric_many_merged.groupby('facility_type').agg({'risk': lambda x: np.mean([e[0] for e in x])}).sort_values('risk').head(20)
# same with the sign flipped, so ascending order shows the highest-risk types first
establishment_centric_many_merged.groupby('facility_type').agg({'risk': lambda x: np.mean([-e[0] for e in x])}).sort_values('risk').head(20)
# Correlations between socio-economic metrics and the inspection-derived features
corr_withfood = establishment_centric_many_merged[ good_metrics + bad_metrics + ['average_risk', 'n_inspections', 'average_result']].corr()
corr_withfood
# Mask the self-correlations so the heatmap diagonal does not saturate the scale.
# Fix: use .loc — the original chained indexing (corr_withfood[c][c] = None)
# triggers SettingWithCopyWarning and silently stops working under pandas
# copy-on-write.
for c in corr_withfood.columns:
    corr_withfood.loc[c, c] = None
# Short labels for the y axis, reversed to match the row order of the matrix
cols = ['per18-64', 'hardship',\
'perNoHighschool', 'Unemployment', 'Per poverty', 'Per crowded', 'Life exp', 'Income'][::-1]
fig = go.Figure(data=go.Heatmap(
z=corr_withfood[['average_result', 'n_inspections', 'average_risk']],
x=['average_result', 'n_inspections', 'average_risk'],
y=cols,
hoverongaps = False,
colorscale=px.colors.diverging.Tealrose))
fig.show()
# Export the figure for the project web page
plotly.offline.plot(fig, filename='../docs/img/corr_socio_good.html', auto_open=False);
# Build the feature list: one-hot facility-type columns with at least 30
# positive samples, plus the inspection-derived features.
# Fix: the original removed items from `features` while iterating it, which
# skips the element right after every removal — consecutive rare facility
# types were silently kept. Iterate the candidates and build a kept list.
candidate_features = list(establishment_centric_many.facility_type.unique())
features = []
for feature in candidate_features:
    if(sum(establishment_centric_many_merged[feature])<30):
        print('Removing feature... ' + feature)
    else:
        features.append(feature)
features += ['average_risk','average_result', 'n_inspections']
establishment_centric_many_merged = establishment_centric_many_merged.dropna(subset=features)
# Make classes completely balanced in the validation dataset by dropping a few inspections.
# NOTE(review): the magic number 943 is the class imbalance at the time of writing —
# it will silently break if the data changes; consider deriving it from value_counts().
val_set = establishment_centric_many_merged.sort_values('inspected_after_2018')
val_set = val_set.iloc[:-943]
# Train a Logistic regression model
target = 'inspected_after_2018'
val_set = val_set.dropna(subset=features + [target])
# Scale features to [0, 1] with MinMaxScaler (below) so coefficients are comparable
# To estimate the coefficients and the accuracy we use Monte Carlo cross-validation:
# 100 random train/test splits, fitting a fresh scaler + model each time.
coefs = []
accs = []
for _ in tqdm(range(100)):
    train, test = train_test_split(val_set)
    # Fit the scaler on the training fold only to avoid leakage into the test fold
    sc = MinMaxScaler()
    X = sc.fit_transform(train[features])
    reg = LogisticRegression(solver='liblinear') # TODO: consider tuning the regularization strength C
    reg.fit(X, train[target])
    coefs.append(reg.coef_[0])
    X_test = sc.transform(test[features])
    # accuracy_score is symmetric, so the (y_pred, y_true) argument order is harmless here
    accs.append(accuracy_score(reg.predict(X_test), test[target]))
# 1.64 * std / sqrt(n) is the half-width of a ~90% confidence interval on the mean
print('The mean accuracy estimated using Montecarlo cross validation is %1.3f +- %1.3f, compared to a random accuracy of 0.5. '%(np.mean(accs), 1.64*np.std(accs)/np.sqrt(len(accs))))
# Let's find the mean of the coefficients over the Monte Carlo repetitions
coef_mean = np.mean(coefs, axis=0)
coef_std = np.std(coefs, axis=0)
coeffs = []
margins = []
names = []
# Sort features by descending mean coefficient; 1.64 is the z-value of a ~90% CI
for c, std, f in sorted(zip(coef_mean, coef_std, features), key = lambda x: -x[0]):
    names.append(f.upper())
    coeffs.append(c)
    margins.append(1.64*std/np.sqrt(len(coefs)))
    print('%s -Coefficient: %2.3f. -Confidence interval 90 per cent: %f to %f.(%f)'%(f.upper(), c, c-1.64*std/np.sqrt(len(coefs)), c+1.64*std/np.sqrt(len(coefs)),1.64*std/np.sqrt(len(coefs)) ))
# Bar chart of the five largest and five smallest coefficients,
# with the ~90% confidence margins as error bars
fig = go.Figure()
fig.add_trace(go.Bar(
name="Upper",
x=names[:5], y=coeffs[:5],
error_y=dict(type='data', array=margins[:5]),
showlegend=False
))
fig.add_trace(go.Bar(
name="Lower",
x=names[-5:], y=coeffs[-5:],
error_y=dict(type='data', array=margins[-5:]),
showlegend=False
))
# Compact layout sized for embedding in the project web page
fig.update_layout(
margin=dict(l=0, r=0, t=0, b=0),
autosize=True,
width=530,
height=450,
)
fig.show()
# Export the figure for the project web page
plotly.offline.plot(fig, filename='../docs/img/bar_coeff.html', auto_open=False);